In [ ]:
import pandas as pd
import numpy as np
import statistics #mathematical statistics
import matplotlib.pyplot as plt
import plotly.express as px #create entire figures
import seaborn as sns #data visuals

#html export
import plotly.io as pio
pio.renderers.default = 'notebook'
In [ ]:
df_psy = pd.read_csv('psy_data_setEN.csv')
df_psy
Out[ ]:
Person age married orientation religion Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11
0 1 16 Never Hetero Other 1.0 7.0 7.0 7.0 7.0 5.0 1.0 4.0 2.0 4.0 4.0
1 2 16 Never New Types Christian Other 6.0 4.0 7.0 4.0 7.0 7.0 1.0 4.0 2.0 4.0 4.0
2 3 17 Never Homo Christian 2.0 2.0 2.0 6.0 5.0 5.0 3.0 3.0 4.0 3.0 4.0
3 4 13 Never New Types Christian 1.0 7.0 4.0 4.0 6.0 1.0 6.0 2.0 2.0 2.0 1.0
4 5 19 Never Hetero Muslim 2.0 3.0 6.0 5.0 5.0 6.0 3.0 2.0 3.0 4.0 4.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
39774 39772 21 Never New Types Muslim 4.0 5.0 7.0 6.0 4.0 7.0 4.0 3.0 3.0 4.0 4.0
39775 39773 48 Married Hetero Christian Other 6.0 7.0 5.0 3.0 6.0 1.0 5.0 2.0 2.0 1.0 2.0
39776 39774 20 Never Hetero Protestant 1.0 5.0 7.0 5.0 3.0 5.0 3.0 3.0 2.0 4.0 4.0
39777 39775 26 Never Hetero Muslim 6.0 3.0 5.0 3.0 5.0 5.0 1.0 2.0 2.0 1.0 3.0
39778 39775 26 Never Hetero Muslim 6.0 3.0 5.0 3.0 5.0 5.0 1.0 2.0 2.0 1.0 3.0

39779 rows × 16 columns

Data Wrangling involves:

  1. Dropping Columns
  2. Null values
  3. Duplicated Values
  4. Outliers
  5. Calitative Variables
In [ ]:
#dropping person column
df_psy = df_psy.drop(df_psy.columns[0], axis =1)
df_psy
Out[ ]:
age married orientation religion Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11
0 16 Never Hetero Other 1.0 7.0 7.0 7.0 7.0 5.0 1.0 4.0 2.0 4.0 4.0
1 16 Never New Types Christian Other 6.0 4.0 7.0 4.0 7.0 7.0 1.0 4.0 2.0 4.0 4.0
2 17 Never Homo Christian 2.0 2.0 2.0 6.0 5.0 5.0 3.0 3.0 4.0 3.0 4.0
3 13 Never New Types Christian 1.0 7.0 4.0 4.0 6.0 1.0 6.0 2.0 2.0 2.0 1.0
4 19 Never Hetero Muslim 2.0 3.0 6.0 5.0 5.0 6.0 3.0 2.0 3.0 4.0 4.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
39774 21 Never New Types Muslim 4.0 5.0 7.0 6.0 4.0 7.0 4.0 3.0 3.0 4.0 4.0
39775 48 Married Hetero Christian Other 6.0 7.0 5.0 3.0 6.0 1.0 5.0 2.0 2.0 1.0 2.0
39776 20 Never Hetero Protestant 1.0 5.0 7.0 5.0 3.0 5.0 3.0 3.0 2.0 4.0 4.0
39777 26 Never Hetero Muslim 6.0 3.0 5.0 3.0 5.0 5.0 1.0 2.0 2.0 1.0 3.0
39778 26 Never Hetero Muslim 6.0 3.0 5.0 3.0 5.0 5.0 1.0 2.0 2.0 1.0 3.0

39779 rows × 15 columns

In [ ]:
#duplicated values
duplicates = sum(df_psy.duplicated())
duplicates
Out[ ]:
16
In [ ]:
#Number of rows on data
len(df_psy)
Out[ ]:
39779
In [ ]:
#Percentage of duplicated values
duplicates/len(df_psy)*100
Out[ ]:
0.04022222780864275

<1% of data is duplicated hence safe to delete these duplicates

In [ ]:
# duplicates
duplicates = df_psy[df_psy.duplicated]
duplicates
Out[ ]:
age married orientation religion Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11
5840 20 Never Hetero Muslim 4.0 7.0 1.0 5.0 6.0 1.0 6.0 1.0 1.0 1.0 1.0
5958 19 Never New Types Muslim 5.0 6.0 7.0 7.0 7.0 7.0 4.0 4.0 3.0 2.0 4.0
8778 20 Never New Types Muslim 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
13331 18 Never Hetero Muslim 1.0 1.0 7.0 7.0 7.0 7.0 1.0 4.0 4.0 4.0 4.0
19915 23 Never Hetero Muslim 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 1.0 1.0 2.0
23087 18 Never Hetero Muslim 1.0 1.0 7.0 7.0 7.0 7.0 1.0 4.0 4.0 4.0 4.0
23201 18 Never Hetero Muslim 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 1.0 1.0 1.0
24848 22 Never New Types Muslim 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
26074 28 Never Hetero Muslim 6.0 6.0 5.0 4.0 6.0 5.0 6.0 2.0 1.0 1.0 2.0
28382 18 Never New Types Muslim 1.0 1.0 1.0 1.0 1.0 1.0 1.0 4.0 4.0 4.0 4.0
32458 23 Never Hetero Muslim 5.0 6.0 6.0 5.0 5.0 5.0 5.0 3.0 2.0 2.0 3.0
38016 21 Never New Types Muslim 1.0 1.0 1.0 1.0 1.0 1.0 1.0 3.0 2.0 2.0 2.0
38951 19 Never Hetero Muslim 2.0 2.0 7.0 6.0 5.0 7.0 2.0 4.0 4.0 4.0 4.0
39148 18 Never New Types Muslim 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 1.0 1.0 1.0
39753 15 Never Hetero Jewish 3.0 6.0 6.0 6.0 3.0 2.0 2.0 3.0 4.0 4.0 4.0
39778 26 Never Hetero Muslim 6.0 3.0 5.0 3.0 5.0 5.0 1.0 2.0 2.0 1.0 3.0
In [ ]:
#dropping duplicates
df_psy = df_psy.drop_duplicates()
In [ ]:
#Number of current duplicates
duplicates = sum(df_psy.duplicated())
duplicates
Out[ ]:
0
In [ ]:
df_psy
Out[ ]:
age married orientation religion Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11
0 16 Never Hetero Other 1.0 7.0 7.0 7.0 7.0 5.0 1.0 4.0 2.0 4.0 4.0
1 16 Never New Types Christian Other 6.0 4.0 7.0 4.0 7.0 7.0 1.0 4.0 2.0 4.0 4.0
2 17 Never Homo Christian 2.0 2.0 2.0 6.0 5.0 5.0 3.0 3.0 4.0 3.0 4.0
3 13 Never New Types Christian 1.0 7.0 4.0 4.0 6.0 1.0 6.0 2.0 2.0 2.0 1.0
4 19 Never Hetero Muslim 2.0 3.0 6.0 5.0 5.0 6.0 3.0 2.0 3.0 4.0 4.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
39773 16 Never Asexual Atheist 2.0 3.0 5.0 5.0 5.0 3.0 3.0 2.0 3.0 3.0 4.0
39774 21 Never New Types Muslim 4.0 5.0 7.0 6.0 4.0 7.0 4.0 3.0 3.0 4.0 4.0
39775 48 Married Hetero Christian Other 6.0 7.0 5.0 3.0 6.0 1.0 5.0 2.0 2.0 1.0 2.0
39776 20 Never Hetero Protestant 1.0 5.0 7.0 5.0 3.0 5.0 3.0 3.0 2.0 4.0 4.0
39777 26 Never Hetero Muslim 6.0 3.0 5.0 3.0 5.0 5.0 1.0 2.0 2.0 1.0 3.0

39763 rows × 15 columns

There is a reduction in number of rows due to deletion of duplicated values

In [ ]:
#info on the data
df_psy.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 39763 entries, 0 to 39777
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          39763 non-null  int64  
 1   married      39763 non-null  object 
 2   orientation  39762 non-null  object 
 3   religion     39761 non-null  object 
 4   Q1           39761 non-null  float64
 5   Q2           39758 non-null  float64
 6   Q3           39759 non-null  float64
 7   Q4           39761 non-null  float64
 8   Q5           39759 non-null  float64
 9   Q6           39759 non-null  float64
 10  Q7           39759 non-null  float64
 11  Q8           39758 non-null  float64
 12  Q9           39759 non-null  float64
 13  Q10          39760 non-null  float64
 14  Q11          39761 non-null  float64
dtypes: float64(11), int64(1), object(3)
memory usage: 4.9+ MB
In [ ]:
null = df_psy[df_psy.isna().any(axis = 1)]#shows all null values

1. df_psy.isna() returns a DataFrame of the same shape as df_psy, where each element is either True if the corresponding element in df_psy is NaN or null, or False otherwise. This operation checks for missing values in each element of the DataFrame.

2. .any(axis=1) is used to check if there are any True values across each row. By specifying axis=1, it checks for the presence of at least one True value in each row. The result is a boolean Series with the same number of rows as df_psy, where each element indicates whether the corresponding row has any missing values.

3. df_psy[df_psy.isna().any(axis=1)] is indexing df_psy using the boolean Series from the previous step. It selects only those rows from df_psy where the corresponding value in the boolean Series is True, meaning that row has at least one missing value.

The result is assigned to null, which will be another DataFrame containing only the rows from df_psy that have missing values in any of their elements.

In [ ]:
null
Out[ ]:
age married orientation religion Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11
90 15 Never New Types NaN 1.0 NaN NaN 6.0 NaN 6.0 NaN NaN NaN 4.0 4.0
137 20 Never Hetero Atheist NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
153 24 Never Homo Hindu 5.0 NaN 4.0 5.0 1.0 NaN NaN NaN NaN NaN NaN
225 19 Never NaN NaN 2.0 NaN NaN 6.0 NaN NaN 3.0 NaN 2.0 NaN 3.0
257 15 Never Homo Jewish NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.0 2.0
In [ ]:
#dropping null values
df_psy = df_psy.dropna()
In [ ]:
df_psy
Out[ ]:
age married orientation religion Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11
0 16 Never Hetero Other 1.0 7.0 7.0 7.0 7.0 5.0 1.0 4.0 2.0 4.0 4.0
1 16 Never New Types Christian Other 6.0 4.0 7.0 4.0 7.0 7.0 1.0 4.0 2.0 4.0 4.0
2 17 Never Homo Christian 2.0 2.0 2.0 6.0 5.0 5.0 3.0 3.0 4.0 3.0 4.0
3 13 Never New Types Christian 1.0 7.0 4.0 4.0 6.0 1.0 6.0 2.0 2.0 2.0 1.0
4 19 Never Hetero Muslim 2.0 3.0 6.0 5.0 5.0 6.0 3.0 2.0 3.0 4.0 4.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
39773 16 Never Asexual Atheist 2.0 3.0 5.0 5.0 5.0 3.0 3.0 2.0 3.0 3.0 4.0
39774 21 Never New Types Muslim 4.0 5.0 7.0 6.0 4.0 7.0 4.0 3.0 3.0 4.0 4.0
39775 48 Married Hetero Christian Other 6.0 7.0 5.0 3.0 6.0 1.0 5.0 2.0 2.0 1.0 2.0
39776 20 Never Hetero Protestant 1.0 5.0 7.0 5.0 3.0 5.0 3.0 3.0 2.0 4.0 4.0
39777 26 Never Hetero Muslim 6.0 3.0 5.0 3.0 5.0 5.0 1.0 2.0 2.0 1.0 3.0

39758 rows × 15 columns

There is a reduction in number of rows due to deletion of null values from the Dataframe

OUTLIERS

Outliers are the extreme values within the dataset that deviates significantly from the rest of the (so-called normal)objects—either being much larger or significantly smaller.

Outliers can be detected using

  1. *Visualization* i.e. Box Plots- a graphical representation that displays the distribution of numerical data through quartiles. & Scatter Plots
  2. *Implementing mathematical formulas on the dataset*
  3. *Using the statistical approach* e.g IQR (Inter Quartile Range)
In [ ]:
#using boxplot to find outlier for Q8
var = df_psy['Q8']
plt.boxplot(var)
plt.show()

There are no outliers

In [ ]:
px.box(var, points = 'all')# is using the Plotly Express library in Python to create a box plot with all data points displayed

Based on the box plot using Plotly:

  1. Bottom horizontal line of box plot is minimum value which is 1
  2. First horizontal line of rectangle shape of box plot is First quartile or 25% which is 2
  3. Second horizontal line of rectangle shape of box plot is Second quartile or 50% or median which is 3
  4. Third horizontal line of rectangle shape of box plot is third quartile or 75% which is 4
  5. Top horizontal line of rectangle shape of box plot is maximum value which is still 4
  6. Anything outside blue box plot is outlier data or erroneous data hence no outliers.
In [ ]:
num = df_psy.drop(['married', 'orientation', 'religion'], axis=1)
fig = px.box(num.melt(),y = "value", facet_col='variable', boxmode= 'overlay', color = 'variable')#plotting outliers for all variables
fig.update_yaxes(matches = None)

Outliers are in age, Q4, Q5 & Q11

  1. Q4 min-1 max-7 outlier-33
  2. Q5 min-1 max-7 outlier-71
  3. Q11 min-1 max-4 outlier-30
In [ ]:
#finding maximum age
max_age = df_psy['age'].max()
max_age
Out[ ]:
1998

This age is impractical

In [ ]:
max_age = df_psy[df_psy['age']>100] #finding age outliers
max_age
Out[ ]:
age married orientation religion Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11
5342 223 Never Hetero Muslim 7.0 6.0 3.0 2.0 1.0 2.0 6.0 2.0 1.0 1.0 1.0
10233 1996 Never New Types Muslim 5.0 7.0 7.0 7.0 5.0 3.0 5.0 4.0 4.0 3.0 4.0
14238 117 Never Homo Christian Other 3.0 1.0 7.0 6.0 3.0 6.0 2.0 4.0 3.0 4.0 3.0
21385 1998 Never Asexual Muslim 1.0 1.0 6.0 7.0 1.0 1.0 1.0 2.0 1.0 2.0 2.0
24900 115 Never Homo Christian 5.0 3.0 6.0 6.0 5.0 6.0 2.0 3.0 2.0 1.0 2.0
30029 1993 Married Hetero Muslim 4.0 7.0 7.0 7.0 7.0 6.0 2.0 4.0 4.0 3.0 4.0
33732 1991 Never Hetero Atheist 2.0 7.0 7.0 5.0 6.0 6.0 2.0 3.0 3.0 3.0 4.0
In [ ]:
df_psy = df_psy[df_psy['age']<100] #deletes the 7 rows which are age outliers
In [ ]:
Q = df_psy[df_psy['Q4']>8] #finding outlier in Q4
Q
Out[ ]:
age married orientation religion Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11
34616 21 Never Hetero Atheist 3.0 7.0 3.0 33.0 7.0 1.0 5.0 3.0 4.0 4.0 4.0

The outlier in Q4 is 33 as according to the Plotly Express box plot the min value is 1 and max value 7

In [ ]:
#new rows for new dataframe
df_psy = df_psy[df_psy['Q4']<8]
df_psy = df_psy[df_psy['Q5']<8]
df_psy = df_psy[df_psy['Q4']<5]
In [ ]:
#plotting numeric variables without outliers
num = df_psy.drop(['married', 'orientation', 'religion'], axis=1)
fig = px.box(num.melt(),y = "value", facet_col='variable', boxmode= 'overlay', color = 'variable')#plotting outliers for all variables
fig.update_yaxes(matches = None)

NON-NUMERIC VARIABLES

In [ ]:
#Analyzing possible outliers in non-numeric variables making sure they are proper
df_psy.married.unique()
Out[ ]:
array(['Never', 'Married', 'Previously', 'Second'], dtype=object)
In [ ]:
df_psy.orientation.unique()
Out[ ]:
array(['New Types', 'Bi', 'Hetero', 'Homo', 'Asexual'], dtype=object)
In [ ]:
df_psy.religion.unique()
Out[ ]:
array(['Christian Other', 'Christian', 'Atheist', 'Protestant',
       'Agnostic', 'Other', 'Jewish', 'Hindu', 'Buddhist', 'Muslim',
       'Mormon', 'Sikh'], dtype=object)
In [ ]:
#changing to a numerical variable using dictionary
matrimonial = {
    'Never' : 1,
    'Previously' : 2,
    'Married' : 3, 
    'Second' : 4
}
df_psy['MarriedN'] = df_psy['married'].map(matrimonial)#builds a new column and maps to the dictionary

sexuality = {
    'New Types' : 1,
    'Bi' : 2,
    'Hetero' : 3, 
    'Homo' : 4,
    'Asexual' : 5
}
df_psy['OrientationN'] = df_psy['orientation'].map(sexuality)#builds a new column and maps to the dictionary

religia = {
    'Christian Other' : 1,
    'Christian' : 2,
    'Atheist' : 3, 
    'Protestant' : 4,
    'Agnostic' : 5,
    'Other' : 6,
    'Jewish' : 7,
    'Hindu' : 8,
    'Buddhist' : 9,
    'Muslim': 10,
    'Mormon' : 11,
    'Sikh' : 12
}
df_psy['ReligiaN'] = df_psy['religion'].map(religia)#builds a new column and maps to the dictionary
In [ ]:
df_psy
Out[ ]:
age married orientation religion Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11 MarriedN OrientationN ReligiaN
1 16 Never New Types Christian Other 6.0 4.0 7.0 4.0 7.0 7.0 1.0 4.0 2.0 4.0 4.0 1 1 1
3 13 Never New Types Christian 1.0 7.0 4.0 4.0 6.0 1.0 6.0 2.0 2.0 2.0 1.0 1 1 2
6 17 Never Bi Christian Other 2.0 6.0 5.0 2.0 1.0 3.0 5.0 1.0 2.0 3.0 2.0 1 2 1
7 29 Never Bi Atheist 7.0 4.0 5.0 2.0 2.0 3.0 5.0 1.0 1.0 1.0 1.0 1 2 3
10 15 Never Hetero Protestant 5.0 6.0 6.0 4.0 4.0 7.0 5.0 3.0 2.0 3.0 2.0 1 3 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
39765 22 Never Hetero Muslim 1.0 7.0 2.0 4.0 6.0 1.0 4.0 1.0 1.0 2.0 3.0 1 3 10
39771 27 Never Hetero Christian 6.0 6.0 7.0 4.0 6.0 5.0 5.0 2.0 1.0 2.0 2.0 1 3 2
39772 16 Never Hetero Agnostic 6.0 4.0 5.0 3.0 5.0 2.0 5.0 3.0 2.0 2.0 4.0 1 3 5
39775 48 Married Hetero Christian Other 6.0 7.0 5.0 3.0 6.0 1.0 5.0 2.0 2.0 1.0 2.0 3 3 1
39777 26 Never Hetero Muslim 6.0 3.0 5.0 3.0 5.0 5.0 1.0 2.0 2.0 1.0 3.0 1 3 10

14313 rows × 18 columns

In [ ]: